Isolating cell subsets with scGate: a de novo analysis
Preliminars
Background
scGate is an open source tool designed for gating specific and uncontaminated cell types in a scRNAseq experiment. We aim to gate cell subtypes at high precision/PPV (i.e achieving high cell type purity) and we don’t care about some loss of true cells (i.e. sensitivity > 90% is ideal). We aim at a reliable tool to purify individual cell types from scRNA-seq data from complex samples (e.g. whole tissue, whole tumor) to enable i) downstream cell type-specific analysis (e.g. reference-atlas projection) and ii) cross-study comparisons/meta-analysis of cell types.
This demo
In previous vignettes (tcell.html and tced8.html), we show basic functionality and usage of scGate package over well annotated datasets. In this demo, we present a more realistic situation, namely, when cell types are not known a priori and you are interested in usage scGate for de-novo isolation of any cellular subset of interest. We will work with a mouse dataset Gubin 2018, Yost.2019
R Environment
Using Renv
if (!requireNamespace("renv")) install.packages("renv")
renv::activate()
renv::restore()
library(gridExtra)
library(ggplot2)
library(reshape2)
library(GEOquery)
library(patchwork)
library(Seurat)
library(cowplot)Or alternatively, manually installing scGate from GitHub
if (!requireNamespace("remotes")) install.packages("remotes")
remotes::install_github("carmonalab/UCell")
remotes::install_github("carmonalab/scGate")
library(scGate)
if (!requireNamespace("GEOquery")){
install.packages("BiocManager")
BiocManager::install("GEOquery")
}Download and process Gubin mouse dataset
Brief description
geo_acc <- "GSE119352"
do_download <- T
gse <- getGEO(geo_acc)
datadir <- "input/Gubin"
dir.create(datadir)
series <- paste0(geo_acc, "_series_matrix.txt.gz")
if (do_download) {
getGEOSuppFiles(geo_acc, baseDir = datadir)
system(paste0("gunzip ", datadir, "/", geo_acc, "/*meta_data.tsv.gz"))
for (acc in gse[[series]]$geo_accession) {
dir.create(file.path(datadir, acc))
getGEOSuppFiles(acc, baseDir = datadir) #untar...
fns <- list.files(paste0(datadir, "/", acc, "/"))
for (fn in fns) {
# rename to basic names: matrix, genes, barcodes
system(paste0("mv ", paste0(datadir, "/", acc, "/", fn), " ", paste0(datadir,
"/", acc, "/", sub(".*_", "", fn, perl = T))))
}
system(paste0("gunzip ", datadir, "/", acc, "/*gz"))
}
}
# as.character(gse[[1]]$title[gse[[1]]$geo_accession==acc])
geoAccToName <- gse[[1]]$title
names(geoAccToName) <- gse[[1]]$geo_accessionRead the sparse count matrices
data.list <- list()
for (i in 1:length(gse[[series]]$geo_accession)) {
acc = gse[[series]]$geo_accession[i]
fname <- paste0(datadir, "/", acc)
data <- Read10X(fname)
title <- gse[[1]]$title[i]
c <- gsub(pattern = "(\\S+)", replacement = paste0("GU-\\1-", i), colnames(data),
perl = TRUE)
colnames(data) <- c
tmp.seurat <- CreateSeuratObject(counts = data, project = "Gubin", min.cells = 3,
min.features = 50)
tmp.seurat@meta.data$Sample <- acc
tmp.seurat@meta.data$Condition <- title
data.list[[title]] <- tmp.seurat
rm(tmp.seurat)
}
names(data.list)
data.list[[1]]$Treatment <- "Control"
data.list[[2]]$Treatment <- "aPD1"
data.list[[3]]$Treatment <- "aCTLA4"
data.list[[4]]$Treatment <- "aPD1_aCTLA4"
data.seurat.raw <- Reduce(merge, data.list)Data Preprocessing
# basic information of generated seurat object
data.seurat.raw
An object of class Seurat
14490 features across 14618 samples within 1 assay
Active assay: RNA (14490 features, 0 variable features)
# Condition composition
table(data.seurat.raw$Condition)
CD45+ cells from tumor microenvironment of aCTLA4 treated mice
3372
CD45+ cells from tumor microenvironment of aPD1 and aCTLA4 treated mice
3966
CD45+ cells from tumor microenvironment of aPD1 treated mice
3440
CD45+ cells from tumor microenvironment of IgG2a isotype control antibody treated mice
3840
# Sample composition
table(data.seurat.raw$Sample)
GSM3371684 GSM3371685 GSM3371686 GSM3371687
3840 3440 3372 3966
# Treatment composition
table(data.seurat.raw$Treatment)
aCTLA4 aPD1 aPD1_aCTLA4 Control
3372 3440 3966 3840 Ribosomal and mitochondrial content
percent.ribo.dv <- PercentageFeatureSet(data.seurat.raw, pattern = "^Rp[ls]")
percent.mito.dv <- PercentageFeatureSet(data.seurat.raw, pattern = "^mt-")
data.seurat.raw <- AddMetaData(data.seurat.raw, metadata = percent.ribo.dv, col.name = "percent.ribo")
data.seurat.raw <- AddMetaData(data.seurat.raw, metadata = percent.mito.dv, col.name = "percent.mito")Look at distributions per sample
pl <- VlnPlot(data.seurat.raw, features = c("nFeature_RNA", "nCount_RNA", "percent.ribo",
"percent.mito"), ncol = 4, pt.size = 0, combine = FALSE, group.by = "Treatment")
pll <- list()
for (i in seq_along(pl)) {
pll[[i]] = pl[[i]] + theme(axis.text.x = element_blank(), legend.position = "none")
}
plot_grid(plotlist = pll, ncol = 4)## nFeatures
summary(data.seurat.raw@meta.data$nFeature_RNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
166 770 1088 1178 1468 6972
## nCounts
summary(data.seurat.raw@meta.data$nCount_RNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
792 1700 2834 3432 4414 78677
# % ribo
summary(data.seurat.raw@meta.data$percent.ribo)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2381 11.6778 16.2097 18.1720 24.1830 65.9930
# % mito
summary(data.seurat.raw@meta.data$percent.mito)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.6066 0.8999 1.0833 1.2931 92.1209 Basic quality control
# before filtering
sprintf("%s cells before filtering", dim(data.seurat.raw)[2])
[1] "14618 cells before filtering"
data.seurat <- subset(data.seurat.raw, subset = nFeature_RNA > 500 & nFeature_RNA <
4000 & nCount_RNA > 1000 & nCount_RNA < 15000 & percent.ribo < 50 & percent.mito <
10)
# after filtering
sprintf("%s cells after filtering", dim(data.seurat)[2])
[1] "13690 cells after filtering"Data normalization and unsupervised analysis
ndim = 30
set.seed(1234)
data.seurat <- NormalizeData(data.seurat, verbose = FALSE)
data.seurat <- FindVariableFeatures(data.seurat, selection.method = "vst", nfeatures = 1000,
verbose = FALSE)
bk.list <- unlist(scGate::genes.blacklist.Mm)
data.seurat@assays$RNA@var.features <- setdiff(data.seurat@assays$RNA@var.features,
bk.list)
data.seurat <- ScaleData(data.seurat)
data.seurat <- RunPCA(data.seurat, features = data.seurat@assays$RNA@var.features,
ndims.print = 1:5, nfeatures.print = 5, npcs = ndim)
## Run Umap
data.seurat <- RunUMAP(data.seurat, reduction = "pca", dims = 1:ndim, seed.use = 123)2D visualization using UMAP
DimPlot(data.seurat, reduction = "umap", group.by = "Treatment") + ggtitle("UMAP by sample")Plot some clasical markers
features = c("Ptprc", "Lck", "Csf1r", "Cd3e", "Foxp3", "Cd8a", "Cd4", "H2-Eb1", "H2-Aa",
"Cd300e", "Itgal", "C1qa", "C1qb", "Gpnmb", "Fabp5", "Saa3", "Clec10a", "Irf8",
"Ccr7", "S100a9", "Plac8", "Xcr1", "Spi1", "Siglech", "Ccr9", "nCount_RNA", "nFeature_RNA")
plt.classical.mkr <- FeaturePlot(data.seurat, reduction = "umap", features = features,
order = T, ncol = 4)
plot(plt.classical.mkr) Display some selected markers
features = c("Lck", "Csf1r", "Spi1", "Cd3e", "H2-Eb1", "Ccr7", "S100a9", "Xcr1",
"Siglech")
plt.selected.mkr <- FeaturePlot(data.seurat, reduction = "umap", features = features,
order = T, ncol = 3, max.cutoff = "q99")
plot(plt.selected.mkr)Export full dataset
dir.create("aux")
saveRDS(data.seurat, file = "aux/Gubin_CD45_clean.rds")scGate for isolating cell subsets
Filter on T cells
ffile = "./aux/tcell.gubin.rds"
if (file.exists(ffile)) {
gate = F
data.T <- readRDS(ffile)
} else {
gate = T
}data.T <- scGate(data.seurat, gating.model = scGate_DB$mouse$Tcell, sd.out = 5)
saveRDS(data.T, ffile)plt.ispure <- DimPlot(data.T, group.by = "is.pure") + theme(aspect.ratio = 1)
plt.ann <- DimPlot(data.T, group.by = "scGate.annotation") + theme(aspect.ratio = 1)
features = c("Lck", "Cd3e", "Fcer1g")
lymphoid <- FeaturePlot(data.T, reduction = "umap", ncol = 3, features = features,
order = T, max.cutoff = "q99") + theme(aspect.ratio = 1)
layout <- "
AAAAAAA
BBB#CCC"
plt <- wrap_plots(A = lymphoid, B = plt.ispure, C = plt.ann, design = layout)
pltFilter myeloid APCs
scGate APC model
ffile = "./aux/apc.gubin.rds"
if (file.exists(ffile)) {
gate = F
data.myel.scgate <- readRDS(ffile)
} else {
gate = T
}data.myel.scgate <- scGate(data.seurat, gating.model = scGate_DB$mouse$Monocyte_Macrophage_Dendritic_pDendritic,
sd.in = 3, sd.out = 7)
saveRDS(data.myel.scgate, ffile)plt.ispure <- DimPlot(data.myel.scgate, group.by = "is.pure")
plt.ann <- DimPlot(data.myel.scgate, group.by = "scGate.annotation")
features = c("Csf1r", "Spi1", "H2-Eb1", "S100a9")
apcs <- FeaturePlot(data.seurat, reduction = "umap", ncol = 4, features = features,
order = T, max.cutoff = "q99")
layout <- "
AAAAA
BB#CC"
plt <- wrap_plots(A = apcs, B = plt.ispure, C = plt.ann, design = layout)
pltProject into myeloid atlas Load atlas
#Project Gubin data into reference myeloid atlas